import pandas as pd
import numpy as np
from io import StringIO

from sklearn.model_selection import train_test_split, cross_val_score
from six import StringIO  # NOTE: redundant on Python 3 — six.StringIO *is* io.StringIO
from sklearn.tree import plot_tree, DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, precision_score, mean_squared_error
from sklearn.metrics import ConfusionMatrixDisplay
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex
## graphviz
import graphviz
## seaborn
import seaborn as sns
sns.set_context("paper", rc={"font.size":12,"axes.titlesize":14,"axes.labelsize":12})
sns.set_style("whitegrid")
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
import matplotlib.gridspec as gridspec
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
# Graphics in retina format
%config InlineBackend.figure_format = 'retina'
## pydot
import pydot
import warnings
warnings.filterwarnings("ignore")
This dataset can be extracted from the `ISLR` R package.
# Load the Hitters baseball data (first CSV column holds the player names,
# used as the index) and drop rows with missing values — Salary has NAs.
Hitters = pd.read_csv('Data/Hitters.csv', index_col=0).dropna()
Hitters.head()
# Side-by-side histograms of Salary and log(Salary).
# FIX: the previous `fig = go.Figure()` was dead code — it was immediately
# overwritten by make_subplots — and the subplot titles misspelled "Histogram".
fig = make_subplots(rows=1, cols=2,
                    subplot_titles=('Salary Histogram', 'log(Salary) Histogram'))
# Left: raw salaries, binned every 100.
fig.add_trace(go.Histogram(x= Hitters['Salary'].values, xbins=dict(size=100), showlegend=False), 1, 1)
# Right: log-salaries, binned every 0.2.
fig.add_trace(go.Histogram(x= np.log(Hitters['Salary'].values), xbins=dict(size=0.2), showlegend=False), 1, 2)
# Axis titles and fixed ranges so the two panels are comparable.
fig.update_xaxes(title_text='Salary', range=[0, 2500], row=1, col=1)
fig.update_yaxes(title_text='Frequency', range=[0, 50], row=1, col=1)
fig.update_xaxes(title_text='log(Salary)', range=[4, 8], row=1, col=2)
fig.update_yaxes(title_text='Frequency', range=[0, 50], row=1, col=2)
# Per-panel bar colours.
fig.update_traces(marker_color='RoyalBlue', marker_line_color='Navy', marker_line_width=1.5, opacity=1, row=1, col=1)
fig.update_traces(marker_color='lightYellow', marker_line_color='darkRed', marker_line_width=1.5, opacity=1, row=1, col=2)
# White background with a light-grey grid and mirrored frame.
fig.update_layout(plot_bgcolor= 'white')
fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='Lightgray')
fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True)
fig.show()
We would like to define a function that summarizes our regression tree. This can be done using the following packages.
| Package | Description |
|---|---|
| sklearn.tree.export_graphviz | Exports a decision tree in the DOT format. |
| StringIO | io.StringIO in Python 3 |
| pydot | Visualizes the graph |
def Tree_Graph(Estimator, Features, Class_Names = None, fill=True):
    """Render a fitted sklearn decision tree inline as a PNG.

    Pipeline: export_graphviz writes DOT text into an in-memory buffer,
    pydot converts the DOT to a PNG, and IPython displays it.

    Parameters
    ----------
    Estimator : fitted DecisionTreeRegressor or DecisionTreeClassifier
    Features : list of str
        Feature names used to label the internal nodes.
    Class_Names : list of str, optional
        Class labels (classifiers only); ignored for regressors.
    fill : bool
        Colour nodes by majority class / mean response.
    """
    # FIX: use the standard-library StringIO directly; the `six`
    # compatibility shim is unnecessary on Python 3.
    from io import StringIO
    dot_data = StringIO()
    export_graphviz(Estimator, out_file = dot_data, feature_names = Features,
                    class_names = Class_Names, filled = fill)
    # graph_from_dot_data returns a list of graphs; DOT output here has one.
    graph, = pydot.graph_from_dot_data(dot_data.getvalue())
    display(Image(graph.create_png()))
The Hitters dataset can be used for predicting a baseball player’s Salary based on Years (the number of years that he has played in the major leagues) and Hits (the number of hits that he made in the previous year).
For the Hitters data, a regression tree for predicting the log salary of a baseball player, based on the number of years that he has played in the major leagues and the number of hits that he made in the previous year. At a given internal node, the label (of the form $X_j < t_k$) indicates the left-hand branch emanating from that split, and the right-hand branch corresponds to $X_j \geq t_k$. For instance, the split at the top of the tree results in two large branches. The left-hand branch corresponds to $Years < 4.5$, and the right-hand branch corresponds to $Years \geq 4.5$. The tree has two internal nodes and three terminal nodes, or leaves. The number in each leaf is the mean of the response for the observations that fall there.
# Fit a three-leaf regression tree of log(Salary) on Years and Hits,
# then render it.
predictors = ['Years', 'Hits']
X = Hitters[predictors].values
y = np.log(Hitters.Salary.values)
# max_leaf_nodes=3 forces exactly the two splits discussed in the text.
reg = DecisionTreeRegressor(max_leaf_nodes = 3)
_ = reg.fit(X, y)
Tree_Graph(reg, Features=predictors)
Overall, the tree segments the players into three regions of predictor space:
# Scatter Years vs Hits and overlay the three regions R1-R3 induced by the
# fitted tree's two splits (Years < 4.5, then Hits < 117.5 on the right half).
fig, ax = plt.subplots(1, 1, figsize=(16, 8))
_ = ax.scatter(Hitters['Years'], Hitters['Hits'], facecolors='RoyalBlue', edgecolors='Navy', alpha = 0.4)
_ = ax.set_xlabel('Years')
_ = ax.set_ylabel('Hits')
# Ticks at the data extremes and at the two split thresholds.
_ = ax.set_xticks([1, 4.5, 24])
_ = ax.set_yticks([1, 117.5, 238])
_ = ax.set_xlim([0, 25])
_ = ax.set_ylim([-1, 250])
# First split boundary: Years >= 4.5 (vertical dashed line; the extent 350
# overshoots the axes and is clipped by the limits above).
_ = ax.vlines(4.5 , 0, 350, linestyles= 'dashed', lw = 1.5, colors='Black')
# Second split boundary: Hits >= 117.5, applied only where Years >= 4.5.
_ = ax.hlines(117.5 , 4.5, 350, linestyles= 'dashed', lw= 1.5, colors='Black')
# Region labels: R1 = left of 4.5; R2 = right & below; R3 = right & above.
_ = ax.annotate(r'$R_1$', xy=(2,117.5), fontsize='xx-large')
_ = ax.annotate(r'$R_2$', xy=(11,60), fontsize='xx-large')
_ = ax.annotate(r'$R_3$', xy=(11,170), fontsize='xx-large')
# Lightly shade each region.
_ = ax.fill_between(np.linspace(0, 4.5, 200), 1, 238, color='LimeGreen',alpha=.1)
_ = ax.fill_between(np.linspace(4.5, 24, 200), 117.5, 238, color='Purple',alpha=.1)
_ = ax.fill_between(np.linspace(4.5, 24, 200), 1, 117.5, color='Salmon',alpha=.1)
There are two steps:
In order to perform recursive binary splitting, we consider all predictors $X_1$, $X_2$, $\ldots$ , $X_p$, and all possible values of the cutpoint s for each of the predictors, and then choose the predictor and cutpoint such that the resulting tree has the lowest RSS.
From the textbook, we have an algorithm for building a Regression Tree, Algorithm 8.1.
In terms of Python implementations, there is an in-depth article by Scikit-learn regarding Decision Trees.
A classification tree is very similar to a regression tree, except that it is used to predict a qualitative response rather than a quantitative one.
These data contain a binary outcome AHD for 303 patients who presented with chest pain. An outcome value of Yes indicates the presence of heart disease based on an angiographic test, while No means no heart disease. There are 13 predictors including Age, Sex, Chol (a cholesterol measurement), and other heart and lung function measurements. Cross-validation results in a tree with six terminal nodes.
Dataset available at this link
# Heart disease data (303 patients): drop the exported row-index column and
# rows with missing values.
# NOTE(review): this URL points at the old ISL site and may no longer
# resolve — verify, or read from a local copy of Heart.csv.
Heart = pd.read_csv('http://faculty.marshall.usc.edu/gareth-james/ISL/Heart.csv').drop('Unnamed: 0', axis=1).dropna()
Heart.head()
We can use Pandas Factorize to encode categorical variables as follows,
$$\mbox{Chest Pain} = \begin{cases} 0,&\mbox{Typical},\\ 1,&\mbox{Asymptomatic},\\ 2,&\mbox{Non-Anginal},\\ 3,&\mbox{Non-Typical}. \end{cases}, \qquad \mbox{Thal} = \begin{cases} 0,&\mbox{Fixed},\\ 1,&\mbox{Normal},\\ 2,&\mbox{Reversable}. \end{cases}, \qquad \mbox{AHD} = \begin{cases} 0,&\mbox{No},\\ 1,&\mbox{Yes}. \end{cases}$$
Heart['ChestPain'] = pd.factorize(Heart['ChestPain'])[0]
# Integer-encode the remaining categorical columns.  pd.factorize assigns
# codes in order of first appearance — assumed to match the mapping stated
# above; verify against the data if the codes matter downstream.
Heart['Thal'] = pd.factorize(Heart['Thal'])[0]
Heart['AHD'] = pd.factorize(Heart['AHD'])[0]
$X$ and $y$ sets, and sklearn DecisionTreeClassifier
# Features: all 13 predictors; target: the binary AHD outcome.
X = Heart.drop('AHD', axis=1)
y = Heart['AHD']
# Grow a tree limited to six leaves, considering 3 randomly-chosen features
# per split.  NOTE(review): max_features=3 makes splitting stochastic and no
# random_state is set, so the rendered tree differs between runs — consider
# fixing a seed for reproducibility.
clf = DecisionTreeClassifier(max_depth=None, max_leaf_nodes=6, max_features=3)
_ = clf.fit(X,y)
Tree_Graph(clf, Features=X.columns.tolist(), Class_Names=['No', 'Yes'])
where $R_1$, . . . , $R_M$ represent a partition of feature space.
This dataset can be extracted from the `ISLR` R package.
# Car-seat sales data: drop the exported row-index column and any rows with
# missing values.
Carseats = pd.read_csv('Data/Carseats.csv').drop('Unnamed: 0', axis=1).dropna()
Carseats.head()
We create a variable High which takes on a value of Yes if the Sales variable exceeds 8, and takes on a value of No otherwise.
Carseats['High'] = Carseats['Sales'].map(lambda x: 'Yes' if x > 8 else 'No')
We can encode categorical variables as follows,
$$\mbox{ShelveLoc} = \begin{cases} 0,&\mbox{Bad},\\ 1,&\mbox{Medium},\\ 2,&\mbox{Good}. \end{cases}, \qquad \mbox{Urban} = \begin{cases} 0,&\mbox{No},\\ 1,&\mbox{Yes}. \end{cases}, \qquad \mbox{US} = \begin{cases} 0,&\mbox{No},\\ 1,&\mbox{Yes}. \end{cases}, \qquad \mbox{High} = \begin{cases} 0,&\mbox{No},\\ 1,&\mbox{Yes}. \end{cases}$$
Class_Names = list(np.sort(Carseats['High'].unique()))
#
# Ordinal-encode shelving quality explicitly so Bad < Medium < Good.
Carseats['ShelveLoc'] = Carseats['ShelveLoc'].replace({'Bad':0, 'Medium':1, 'Good':2}).astype(int)
# factorize with sort=True assigns codes in sorted label order: No=0, Yes=1.
Carseats['US'] = pd.factorize(Carseats['US'], sort = True)[0]
Carseats['Urban'] = pd.factorize(Carseats['Urban'], sort = True)[0]
Carseats['High'] = pd.factorize(Carseats['High'], sort = True)[0]
Now,
Carseats.head()
# Predictors: everything except raw Sales and the label High derived from it
# (keeping Sales would leak the target).
X = Carseats.drop(['Sales', 'High'], axis=1)
y = Carseats.High
# 50/50 train/test split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
# Fit a depth-limited classification tree on the training half.
clf = DecisionTreeClassifier(max_depth=6)
# BUG FIX: the original called clf.fit(X_train, X_test) — using the *test
# feature matrix* as the targets (an accidental multi-output fit) — and then
# refit correctly on the next line.  Fit once with the training labels.
_ = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Per-class precision/recall/F1 (plus accuracy rows) as a tidy DataFrame.
Results = pd.DataFrame(classification_report(y_test, y_pred,
target_names = Class_Names, output_dict=True)).T
display(Results.round(2))
# Colorama banner, then the raw confusion matrix with readable labels.
print(Back.CYAN + Fore.BLACK + Style.BRIGHT + 'Confusion Matrix' + Style.RESET_ALL)
display(pd.DataFrame(data = confusion_matrix(y_test, y_pred), index = Class_Names, columns = Class_Names))
# Raw and row-normalised confusion matrices side by side.
# FIX: sklearn removed plot_confusion_matrix in version 1.2; use
# ConfusionMatrixDisplay.from_estimator (available since sklearn 1.0).
fig, ax = plt.subplots(1, 2, figsize=(14, 4))
_ = ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test, display_labels= Class_Names, cmap= "Blues", normalize= None, ax = ax[0])
_ = ax[0].set_title('Confusion Matrix')
_ = ConfusionMatrixDisplay.from_estimator(clf, X_test, y_test, display_labels= Class_Names, cmap= "Greens", normalize= 'true', ax = ax[1])
_ = ax[1].set_title('Normalized Confusion Matrix')
# Render the fitted tree itself.
Tree_Graph(clf, Features=X.columns.tolist(), Class_Names = Class_Names)
# Boston housing data: predict the median home value (medv) with a shallow
# regression tree.
Boston = pd.read_csv('Data/Boston.csv').drop('Unnamed: 0', axis=1).dropna()
Boston.head()
X = Boston.drop('medv', axis=1)
y = Boston.medv
# 50/50 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)
# Depth 3 keeps the tree small enough to render and interpret.
reg = DecisionTreeRegressor(max_depth=3)
_ = reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)
Tree_Graph(reg, Features=X.columns.tolist())
# Predicted vs actual medv on the held-out half; the dashed diagonal marks
# perfect prediction.
fig, ax = plt.subplots(1, 1, figsize=(6, 6))
_ = ax.scatter(y_pred, y_test, label='medv', facecolors='SkyBlue', edgecolors='MidnightBlue', alpha = 0.8)
# FIX: draw the diagonal with this axes' own transform instead of
# plt.gca().transAxes, which silently depends on pyplot's global
# "current axes" state.
_ = ax.plot([0, 1], [0, 1], '--k', transform=ax.transAxes)
_ = ax.set_xlabel(r'$y_{pred}$')
_ = ax.set_ylabel(r'$y_{test}$')
_ = ax.set_xlim([-1,51])
_ = ax.set_ylim([-1,51])
James, G., Witten, D., Hastie, T., & Tibshirani, R. (2013). An introduction to statistical learning (Vol. 112, pp. 3-7). New York: springer.
Jordi Warmenhoven, ISLR-python